EDA

Load Library

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.2
## Warning: package 'readr' was built under R version 4.4.2
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'stringr' was built under R version 4.4.2
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.4.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.2
## corrplot 0.95 loaded
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.2

Load Data

# Read the training set into a data frame and preview the first rows.
# NOTE(review): the path is absolute and machine-specific.
wines <- read.csv(
  file = "/Users/oyunm/Desktop/SMU/DS-6306-Doing-the-Data-Science/Project 2/Wine Train Set.csv"
)
head(wines)
##   ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1  1           7.2             0.34        0.34           12.6     0.048
## 2  2           6.0             0.27        0.28            4.8     0.063
## 3  3           6.9             0.26        0.49            1.6     0.058
## 4  4           6.6             0.25        0.34            3.0     0.054
## 5  5           7.1             0.17        0.43            1.3     0.023
## 6  6           6.0             0.29        0.25            1.4     0.033
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol  type
## 1                   7                   41 0.99420 3.19      0.40    11.7 white
## 2                  31                  201 0.99640 3.69      0.71    10.0 white
## 3                  39                  166 0.99650 3.65      0.52     9.4 white
## 4                  22                  141 0.99338 3.26      0.47    10.4 white
## 5                  33                  132 0.99067 3.11      0.56    11.7 white
## 6                  30                  114 0.98794 3.08      0.43    13.2 white
##     location quality
## 1      Texas       5
## 2      Texas       5
## 3      Texas       4
## 4 California       6
## 5 California       6
## 6 California       6

Exploration of data

Excluding ID, there are 14 variables: three categorical variables (quality, location, type) and 11 numeric variables.

# Per-column summary: quartiles and mean for numeric columns, length/class for text columns
wines |> summary()
##        ID       fixed.acidity    volatile.acidity  citric.acid    
##  Min.   :   1   Min.   : 3.800   Min.   :0.0800   Min.   :0.0000  
##  1st Qu.:1366   1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500  
##  Median :2732   Median : 7.000   Median :0.2900   Median :0.3100  
##  Mean   :2732   Mean   : 7.218   Mean   :0.3382   Mean   :0.3185  
##  3rd Qu.:4098   3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900  
##  Max.   :5463   Max.   :15.900   Max.   :1.5800   Max.   :1.6600  
##  residual.sugar    chlorides       free.sulfur.dioxide total.sulfur.dioxide
##  Min.   : 0.60   Min.   :0.00900   Min.   :  1.00      Min.   :  6.0       
##  1st Qu.: 1.80   1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 78.0       
##  Median : 3.00   Median :0.04700   Median : 29.00      Median :118.0       
##  Mean   : 5.42   Mean   :0.05613   Mean   : 30.58      Mean   :115.9       
##  3rd Qu.: 8.10   3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:155.0       
##  Max.   :31.60   Max.   :0.61100   Max.   :289.00      Max.   :440.0       
##     density             pH          sulphates         alcohol    
##  Min.   :0.9871   Min.   :2.720   Min.   :0.2200   Min.   : 8.0  
##  1st Qu.:0.9923   1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.5  
##  Median :0.9949   Median :3.210   Median :0.5100   Median :10.3  
##  Mean   :0.9947   Mean   :3.217   Mean   :0.5318   Mean   :10.5  
##  3rd Qu.:0.9969   3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.3  
##  Max.   :1.0103   Max.   :4.010   Max.   :2.0000   Max.   :14.9  
##      type             location            quality     
##  Length:5463        Length:5463        Min.   :3.000  
##  Class :character   Class :character   1st Qu.:5.000  
##  Mode  :character   Mode  :character   Median :6.000  
##                                        Mean   :5.823  
##                                        3rd Qu.:6.000  
##                                        Max.   :9.000
# Column types and the first few values of every column
wines |> str()
## 'data.frame':    5463 obs. of  15 variables:
##  $ ID                  : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ fixed.acidity       : num  7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
##  $ volatile.acidity    : num  0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
##  $ citric.acid         : num  0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
##  $ residual.sugar      : num  12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
##  $ chlorides           : num  0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
##  $ free.sulfur.dioxide : num  7 31 39 22 33 30 9 23 15 47 ...
##  $ total.sulfur.dioxide: num  41 201 166 141 132 114 27 97 151 194 ...
##  $ density             : num  0.994 0.996 0.997 0.993 0.991 ...
##  $ pH                  : num  3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
##  $ sulphates           : num  0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
##  $ alcohol             : num  11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
##  $ type                : chr  "white" "white" "white" "white" ...
##  $ location            : chr  "Texas" "Texas" "Texas" "California" ...
##  $ quality             : int  5 5 4 6 6 6 6 5 5 6 ...

Boxplots of Quality vs each numeric variable

A box plot of fixed.acidity vs. quality

Fixed Acidity measures the natural acids in wine that contribute to its structure, flavor, and color. Up to quality 8, lower fixed acidity tends to go with higher quality, but that is not the case for quality 9.

# Treat quality as a categorical variable so that each score gets its own box
wines$quality <- as.factor(wines$quality)
ggplot(wines, aes(x = quality, y = fixed.acidity, color = quality)) +
  geom_boxplot()

A box plot of volatile.acidity vs quality

Volatile Acidity evaporates quickly and contributes to a wine’s aroma. The lower the volatile acidity, the higher the quality.

# Distribution of volatile acidity at each quality score
ggplot(wines, aes(x = quality, y = volatile.acidity, color = quality)) +
  geom_boxplot()

A box plot of citric.acid vs. quality

Citric Acid can be used for acidification in wines that are naturally lacking in Acid. It adds liveliness and freshness to the wine, bringing a fresher, fruity citrus note. The more citric Acid there is, the higher the quality.

# Distribution of citric acid at each quality score
ggplot(wines, aes(x = quality, y = citric.acid, color = quality)) +
  geom_boxplot()

A box plot of residual sugar vs quality

Residual sugar in wine comes from the natural grape sugars left in a wine after the alcoholic fermentation finishes. The medians across the quality levels don’t show much difference.

# Distribution of residual sugar at each quality score
ggplot(wines, aes(x = quality, y = residual.sugar, color = quality)) +
  geom_boxplot()

A box plot of chlorides vs quality

The amount of chloride in wine is influenced by the type of grape, the vineyard’s location, and surrounding soil and water conditions. According to research, a high level of chloride in wine can lead to a decrease in its market appeal. The boxplots show that wines with lower chlorides tend to have better quality.

# Distribution of chlorides at each quality score
ggplot(wines, aes(x = quality, y = chlorides, color = quality)) +
  geom_boxplot()

A box plot of FSO2 vs quality

Free sulfur dioxide is the portion of the sulfur dioxide in wine that is available to protect it from oxidation. It inhibits the growth of microorganisms. The more free sulfur dioxide there is, the better the quality.

# Distribution of free sulfur dioxide at each quality score
ggplot(wines, aes(x = quality, y = free.sulfur.dioxide, color = quality)) +
  geom_boxplot()

A box plot of Total Sulfur Dioxide vs quality

TSO2 (total sulfur dioxide) is the amount of sulfur dioxide that is free in the wine plus the amount that is bound to other chemicals in the wine. It is also a preservative used in wine-making to prevent oxidation and spoilage and to maintain freshness. The higher the total sulfur dioxide, the better the quality.

# Distribution of total sulfur dioxide at each quality score
ggplot(wines, aes(x = quality, y = total.sulfur.dioxide, color = quality)) +
  geom_boxplot()

A box plot of Density vs quality

Density is about concentration and a fuller body. Better quality wines displayed lesser density.

# Distribution of density at each quality score
ggplot(wines, aes(x = quality, y = density, color = quality)) +
  geom_boxplot()

A box plot of PH vs quality

Lower pH makes the wine more stable and protects it against bacteria. The median pH across quality levels shows a curved pattern: it decreases from quality 3 to 5, and from quality 5 onward higher quality goes with higher pH. However, quality 3 and quality 9 have similar pH medians.

# Distribution of pH at each quality score
ggplot(wines, aes(x = quality, y = pH, color = quality)) +
  geom_boxplot()

A box plot of sulphates vs quality

Sulfates are a group of chemical compounds, including sulfur dioxide. Yeast produces sulfites during fermentation. Although there are some differences in distribution, the medians across the quality levels don’t show a big difference.

# Distribution of sulphates at each quality score
ggplot(wines, aes(x = quality, y = sulphates, color = quality)) +
  geom_boxplot()

A box plot of alcohol vs quality

There is a curve pattern. Until wine quality 5, the alcohol decreases, but from quality 6, it increases.

# Distribution of alcohol at each quality score
ggplot(wines, aes(x = quality, y = alcohol, color = quality)) +
  geom_boxplot()

Tables and bar charts of Quality vs each categorical variable

Clean data: Califormia -> California

# Cross-tabulate quality score against wine type
table(wines[["quality"]], wines[["type"]])
##    
##      red white
##   3    9    16
##   4   44   137
##   5  554  1225
##   6  536  1861
##   7  181   738
##   8   16   142
##   9    0     4
# Cross-tabulate quality score against location (exposes the "Califormia" typo)
table(wines[["quality"]], wines[["location"]])
##    
##     Califormia California Texas
##   3          2          0    23
##   4          2         33   146
##   5         10        450  1319
##   6          0       1939   458
##   7          0        878    41
##   8          0         83    75
##   9          0          4     0
# Fold the misspelled "Califormia" rows into "California" (case-insensitive),
# then re-tabulate to confirm only two locations remain
wines$location <- gsub(
  pattern = "Califormia",
  replacement = "California",
  x = wines$location,
  ignore.case = TRUE
)
table(wines[["quality"]], wines[["location"]])
##    
##     California Texas
##   3          2    23
##   4         35   146
##   5        460  1319
##   6       1939   458
##   7        878    41
##   8         83    75
##   9          4     0

A barchart of type vs quality

White wine mostly shows higher quality; however, we need to be cautious about the difference in sample size.

# Count wines for every quality/type combination. Grouping is dropped before
# mutate(), so `percentage` is each combination's share of ALL wines.
wines_summary <- wines |>
  group_by(quality, type) |>
  summarise(count = n(), .groups = "drop") |>
  mutate(percentage = count / sum(count))

# Proportional stacked bars per quality score, labelled with the raw counts
ggplot(wines_summary, aes(x = quality, y = count, fill = type)) +
  geom_col(position = "fill") +
  geom_text(
    aes(label = count),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    color = "white"
  ) +
  labs(
    title = "A Barchart of Quality by Type",
    x = "Quality",
    y = "Proportion"
  ) +
  theme_bw()

A barchart of location vs quality

California wines make up most of the higher quality levels.

# Count wines for every quality/location combination. Grouping is dropped
# before mutate(), so `percentage` is each combination's share of ALL wines.
wines_summary <- wines |>
  group_by(quality, location) |>
  summarise(count = n(), .groups = "drop") |>
  mutate(percentage = count / sum(count))

# Proportional stacked bars per quality score, labelled with the raw counts
ggplot(wines_summary, aes(x = quality, y = count, fill = location)) +
  geom_col(position = "fill") +
  geom_text(
    aes(label = count),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    color = "white"
  ) +
  labs(
    title = "A Barchart of Quality by Location",
    x = "Quality",
    y = "Proportion"
  ) +
  theme_bw()

Correlation matrix between columns

  • Density vs. alcohol: -0.70
  • Density vs. residual.sugar: 0.55
  • total.sulfur vs. residual.sugar : 0.50
  • Density vs. fixed.acidity : 0.47
  • Quality vs. alcohol : 0.44
  • Free.sulfur vs. residual.sugar : 0.41
  • Sulphates vs. chlorides : 0.40
# Drop the identifier and the two text columns; everything left feeds cor()
excluded_columns <- c("ID", "type", "location")
selected_data <- wines[, !(names(wines) %in% excluded_columns)]

str(selected_data)
## 'data.frame':    5463 obs. of  12 variables:
##  $ fixed.acidity       : num  7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
##  $ volatile.acidity    : num  0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
##  $ citric.acid         : num  0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
##  $ residual.sugar      : num  12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
##  $ chlorides           : num  0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
##  $ free.sulfur.dioxide : num  7 31 39 22 33 30 9 23 15 47 ...
##  $ total.sulfur.dioxide: num  41 201 166 141 132 114 27 97 151 194 ...
##  $ density             : num  0.994 0.996 0.997 0.993 0.991 ...
##  $ pH                  : num  3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
##  $ sulphates           : num  0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
##  $ alcohol             : num  11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
##  $ quality             : Factor w/ 7 levels "3","4","5","6",..: 3 3 2 4 4 4 4 3 3 4 ...
# `quality` is a factor here, and as.numeric() on a factor returns the internal
# level codes (1..k) rather than the labels. Convert through as.character() so
# the column holds the actual quality scores.
selected_data$quality <- as.numeric(as.character(selected_data$quality))

# Compute the correlation matrix for the selected columns
cor_matrix <- cor(selected_data)
print(cor_matrix)
##                      fixed.acidity volatile.acidity  citric.acid residual.sugar
## fixed.acidity           1.00000000       0.21823464  0.333199973    -0.11266700
## volatile.acidity        0.21823464       1.00000000 -0.373323484    -0.20136563
## citric.acid             0.33319997      -0.37332348  1.000000000     0.13228846
## residual.sugar         -0.11266700      -0.20136563  0.132288464     1.00000000
## chlorides               0.29605993       0.37292022  0.051041175    -0.12640693
## free.sulfur.dioxide    -0.28288825      -0.34540899  0.122127249     0.40675420
## total.sulfur.dioxide   -0.33361204      -0.40382125  0.177334114     0.49715732
## density                 0.46627359       0.26191157  0.097985259     0.54533710
## pH                     -0.25210981       0.25590178 -0.322531623    -0.27318560
## sulphates               0.30405382       0.22359740  0.069782595    -0.18947356
## alcohol                -0.08897084      -0.03713716 -0.002550528    -0.36869902
## quality                -0.07128477      -0.26144015  0.090534263    -0.03622339
##                        chlorides free.sulfur.dioxide total.sulfur.dioxide
## fixed.acidity         0.29605993         -0.28288825          -0.33361204
## volatile.acidity      0.37292022         -0.34540899          -0.40382125
## citric.acid           0.05104118          0.12212725           0.17733411
## residual.sugar       -0.12640693          0.40675420           0.49715732
## chlorides             1.00000000         -0.19023849          -0.27194647
## free.sulfur.dioxide  -0.19023849          1.00000000           0.71900054
## total.sulfur.dioxide -0.27194647          0.71900054           1.00000000
## density               0.36474494          0.03235223           0.03253618
## pH                    0.03443033         -0.14575035          -0.23875927
## sulphates             0.39911411         -0.18983600          -0.28530444
## alcohol              -0.25496175         -0.18704256          -0.27434630
## quality              -0.19869648          0.04148439          -0.05520419
##                           density           pH    sulphates      alcohol
## fixed.acidity         0.466273587 -0.252109810  0.304053815 -0.088970837
## volatile.acidity      0.261911567  0.255901781  0.223597402 -0.037137156
## citric.acid           0.097985259 -0.322531623  0.069782595 -0.002550528
## residual.sugar        0.545337095 -0.273185595 -0.189473558 -0.368699015
## chlorides             0.364744942  0.034430326  0.399114115 -0.254961753
## free.sulfur.dioxide   0.032352229 -0.145750353 -0.189836005 -0.187042561
## total.sulfur.dioxide  0.032536183 -0.238759273 -0.285304438 -0.274346298
## density               1.000000000  0.002279624  0.262176353 -0.697516955
## pH                    0.002279624  1.000000000  0.195875331  0.126339216
## sulphates             0.262176353  0.195875331  1.000000000  0.001861973
## alcohol              -0.697516955  0.126339216  0.001861973  1.000000000
## quality              -0.301996919  0.025142885  0.043687536  0.442294127
##                          quality
## fixed.acidity        -0.07128477
## volatile.acidity     -0.26144015
## citric.acid           0.09053426
## residual.sugar       -0.03622339
## chlorides            -0.19869648
## free.sulfur.dioxide   0.04148439
## total.sulfur.dioxide -0.05520419
## density              -0.30199692
## pH                    0.02514289
## sulphates             0.04368754
## alcohol               0.44229413
## quality               1.00000000
# Visualize the correlation matrix as circles with the coefficient overlaid.
# (Run install.packages("corrplot") once if the package is not installed.)
library(corrplot)

corrplot(cor_matrix,
         method = "circle",
         tl.col = "black",       # variable label colour
         tl.cex = 0.8,           # variable label size
         addCoef.col = "black",  # coefficient text colour
         number.cex = 0.8)       # coefficient text size

Boxplots of each variable by Type, grouped by location

# Draw a boxplot of one numeric column split by wine type, with one box per
# location inside each type.
#   data  - data frame containing `type`, `location` and the column in `var`
#   var   - name of the numeric column to plot, as a string
#   label - human-readable name used in the title and on the y axis
# Returns the ggplot object (auto-printed when called at top level).
# The boxes are coloured through the `fill` aesthetic, so the legend title has
# to be set with `fill =`; a `color =` label has no effect on this legend.
boxplot_by_type_location <- function(data, var, label) {
  ggplot(data, aes(x = type, y = .data[[var]], fill = location)) +
    geom_boxplot() +
    theme_bw() +
    labs(
      title = paste("A Boxplot of", label, "by Type"),
      x = "Type",
      y = label,
      fill = "Location"
    )
}

# Fixed Acidity
# Wines from cool-climate grapes are usually high in acidity, while wines from
# warm-climate grapes can be low in acid.
# Red has higher fixed acidity in general. By location, California is higher
# for red wine, and Texas is higher for white wine.
boxplot_by_type_location(wines, "fixed.acidity", "Fixed Acidity")

# Volatile Acidity
# Red has higher volatile acidity, and Texas has higher volatile acidity in
# both red and white.
# Legal limit: 1.2 grams per liter
boxplot_by_type_location(wines, "volatile.acidity", "Volatile Acidity")

# Citric Acid
# California tends to have higher citric acid in both red and white. Texas red
# wine tends to have a lower distribution of citric acid.
# There are unusual outliers in California white wine.
# California tends to add more citric acid than Texas.
boxplot_by_type_location(wines, "citric.acid", "Citric Acid")

# Residual Sugar
# Comes from the natural grape sugars left over after fermentation.
# Red wine has very low residual sugar, while white has higher residual sugar.
# California white wine shows some outliers.
# White wine tends to have more natural grape sugars left over.
boxplot_by_type_location(wines, "residual.sugar", "Residual Sugar")

# Chlorides
# White wine tends to have lower chlorides than red wine (red wine is
# saltier), and Texas has a higher distribution of chlorides in both red and
# white (Texas wine is saltier).
# Usually less than 500 mg/L
boxplot_by_type_location(wines, "chlorides", "Chlorides")

# Free Sulfur Dioxide
# Free sulfur dioxide is higher in white wine than in red. Texas white wine
# has a broader distribution of free sulfur dioxide.
# There is a significant free sulfur dioxide outlier in Texas white wine.
boxplot_by_type_location(wines, "free.sulfur.dioxide", "Free Sulfur Dioxide")

# Total Sulfur Dioxide
# White wine has higher total sulfur dioxide. Texas tends to have higher
# sulfur dioxide.
# White wine needs more preservatives to prevent oxidation and spoilage and to
# maintain freshness.
boxplot_by_type_location(wines, "total.sulfur.dioxide", "Total Sulfur Dioxide")

# Density
# Red wine has a higher density, and Texas wine has a higher density than
# California wine.
# Red wine is generally considered denser than white wine because the grape
# skins stay in contact with the juice during fermentation, which extracts
# tannins and gives a "fuller body"; for white wine the skins are removed,
# leaving a lighter wine with less tannin content.
boxplot_by_type_location(wines, "density", "Density")

# pH
# Red wine has higher pH, meaning lower acidity. Location doesn't show much
# difference here.
boxplot_by_type_location(wines, "pH", "pH")

# Sulphates
# Red wine has higher sulfates, and California has higher sulfates in both red
# and white wines.
# Wine with higher acidity requires less sulfates than wine with lower acidity.
# Wines with more sugar need more sulfites to prevent secondary fermentation.
boxplot_by_type_location(wines, "sulphates", "Sulphates")

# Alcohol
# High sugar => high alcohol.
# Cooler climates make it harder for grapes to ripen, so the fruit is often
# harvested with lower sugar levels => lower alcohol.
# Warmer climates allow grapes to ripen longer on the vine, producing higher
# alcohol levels.
# California has higher alcohol!
boxplot_by_type_location(wines, "alcohol", "Alcohol")

A 3D scatterplot that shows high correlation among density, TSO2, Volatile Acidity

# Interactive 3D scatter of density, total SO2 and volatile acidity.
# Points are coloured by wine type; the two palette entries are assigned in
# the order plotly orders the `type` values.
plot_ly(
  data = wines,
  type = "scatter3d",
  mode = "markers",
  x = ~density,
  y = ~total.sulfur.dioxide,
  z = ~volatile.acidity,
  color = ~type,
  colors = c("#C8102E", "#0033A0"),
  marker = list(size = 4, opacity = 0.7)
) |>
  layout(
    title = "3D Scatterplot: Type of Wine by Density, Total SO2, and Volatile Acidity",
    scene = list(
      xaxis = list(title = "Density"),
      yaxis = list(title = "Total SO2"),
      zaxis = list(title = "Volatile Acidity")
    ),
    legend = list(title = list(text = "Legend"))
  )

2D boxplots of the three variables by type

# Density: the highest point looks like an outlier => more investigation needed.
ggplot(wines, aes(x = type, y = density, fill = type)) +
  geom_boxplot() +
  labs(title = "A boxplot of density by type", x = "Type", y = "Density") +
  theme_bw()

# Total SO2: the legal limit is known to be 350, so the outlier seen in white
# wine should be treated with caution.
ggplot(wines, aes(x = type, y = total.sulfur.dioxide, fill = type)) +
  geom_boxplot() +
  labs(title = "A boxplot of total SO2 by type", x = "Type", y = "Total SO2") +
  theme_bw()

# Volatile acidity: there is an outlier around 1.6 in red wine. The legal
# limit for red wine is 1.4 (NOTE(review): written as mg/L originally, likely
# g/L - confirm), so decide whether to keep this outlier.
ggplot(wines, aes(x = type, y = volatile.acidity, fill = type)) +
  geom_boxplot() +
  labs(
    title = "A boxplot of volatile acidity by type",
    x = "Type",
    y = "Volatile Acidity"
  ) +
  theme_bw()

More investigation in Density

As per domain knowledge, wines with high density tend to have high residual sugar. The density outlier also has a very high residual sugar value, which makes it an influential point for residual sugar as well.

# Pull out the rows behind the density outlier (density above 1.01)
subset_wines <- dplyr::filter(wines, density > 1.01)
print(subset_wines)
##     ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 3152           7.9             0.33        0.28           31.6     0.053
## 2 5151           7.9             0.33        0.28           31.6     0.053
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol  type
## 1                  35                  176  1.0103 3.15      0.38     8.8 white
## 2                  35                  176  1.0103 3.15      0.38     8.8 white
##     location quality
## 1 California       6
## 2 California       6
# Residual sugar by wine type
wines %>%
  ggplot(aes(x = type, y = residual.sugar, fill = type)) +
  geom_boxplot() +
  ggtitle("A boxplot of residual sugar by type") +
  xlab("Type") +
  ylab("Residual Sugar") +
  theme_bw()

# Free sulfur dioxide by wine type. The title and y label previously said
# "residual sugar" (copied from the plot above) although y is
# free.sulfur.dioxide.
wines %>%
  ggplot(aes(x = type, y = free.sulfur.dioxide, fill = type)) +
  geom_boxplot() +
  ggtitle("A boxplot of free sulfur dioxide by type") +
  xlab("Type") +
  ylab("Free Sulfur Dioxide") +
  theme_bw()

# Density against residual sugar, coloured by type: positive relationship
ggplot(wines, aes(x = density, y = residual.sugar, color = type)) +
  geom_point() +
  labs(
    title = "A scatterplot between Density and Residual Sugar",
    x = "Density",
    y = "Residual Sugar"
  )

Wine Quality

# Number of wines at each quality score
wines |>
  ggplot(aes(x = quality)) +
  geom_bar(fill = "royalblue") +
  labs(
    title = "Distribution of Wine Quality",
    x = "Quality",
    y = "Count"
  )

GGpair plot

# Pairs plot of quality, type and location, coloured by wine type
ggpairs(
  wines,
  mapping = aes(color = type),
  columns = c("quality", "type", "location"),
  upper = list(continuous = wrap("cor", size = 3)),
  lower = list(continuous = wrap("points", alpha = 0.6))
) +
  labs(title = "Wine Quality by Location and Type")

EDA by Location

# Pairs plot of every chemistry column plus quality, coloured by location.
# The title previously said "Quality as Color", but colour is mapped to
# `location`.
# NOTE(review): `alpha = .8` sits inside aes(), so it is mapped as an aesthetic
# rather than set as a constant; left as-is to keep the rendered panels
# unchanged.
ggpairs(
  wines,
  mapping = aes(color = location, alpha = .8),
  columns = c(
    "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
    "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
    "pH", "sulphates", "alcohol", "quality"
  ),
  title = "Pairwise Relationships with Location as Color",
  lower = list(continuous = "smooth"),
  upper = list(continuous = "cor")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Calculate mean quality and count by location.
# `quality` is a factor at this point, and mean() on a factor returns NA with
# a warning. Convert the labels back to their numeric scores first (through
# as.character(), so the scores are used rather than the level codes).
summary_table <- wines %>%
  group_by(location) %>%
  summarise(
    count = n(),
    mean_quality = mean(as.numeric(as.character(quality)), na.rm = TRUE)
  )
## Warning: There were 2 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `mean_quality = mean(quality, na.rm = TRUE)`.
## ℹ In group 1: `location = "California"`.
## Caused by warning in `mean.default()`:
## ! argument is not numeric or logical: returning NA
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Render the per-location summary as a pipe-style Markdown table
kable(
  summary_table,
  format = "pipe",
  col.names = c("Location", "Count", "Mean Quality")
)
Location Count Mean Quality
California 3401 NA
Texas 2062 NA
# Proportional stacked bars of location within each quality score, labelled
# with the raw counts. Uses after_stat(count) in place of the `..count..`
# notation, which was deprecated in ggplot2 3.4.0.
ggplot(wines, aes(x = as.factor(quality), fill = location)) +
  geom_bar(position = "fill", color = "black", width = 0.7) + # black bar borders
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_fill(vjust = 0.5), # centre each label in its segment
    size = 3,
    color = "white",
    fontface = "bold"
  ) +
  labs(
    title = "Quality by Location: California vs. Texas",
    x = "Wine Quality",
    y = "Proportion",
    fill = "Location"
  ) +
  scale_fill_manual(values = c("California" = "#0033A0", "Texas" = "#C8102E")) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), # centred title
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10),
    legend.position = "top"
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Alcohol against volatile acidity, coloured by location.
# The title and axis labels were left as "Factor 1" / "Factor 2" placeholders;
# they now name the plotted columns.
ggplot(wines, aes(x = alcohol, y = volatile.acidity, color = location)) +
  geom_point(alpha = .6) +
  labs(
    title = "Scatterplot of Alcohol vs. Volatile Acidity",
    x = "Alcohol",
    y = "Volatile Acidity",
    color = "Location"
  ) +
  theme_solarized()

# Interactive 3D scatter of alcohol, density and volatile acidity,
# with points coloured by location
plot_ly(
  data = wines,
  type = "scatter3d",
  mode = "markers",
  x = ~alcohol,
  y = ~density,
  z = ~volatile.acidity,
  color = ~location,
  colors = c("#0033A0", "#C8102E"),
  marker = list(size = 4, opacity = 0.5)
) |>
  layout(
    title = "3D Scatterplot: Location of Wine by Alcohol, Density, and Volatile Acidity",
    scene = list(
      xaxis = list(title = "Alcohol"),
      yaxis = list(title = "Density"),
      zaxis = list(title = "Volatile Acidity")
    ),
    legend = list(title = list(text = "Legend"))
  )

EDA by Quality

# Interactive 3D scatter of volatile acidity, density and alcohol, with one
# colour per quality level (seven colours are supplied).
# The title previously said "Location of Wine" although the points are
# coloured by quality.
plot_ly(data = wines,
        x = ~volatile.acidity,
        y = ~density,
        z = ~alcohol,
        type = "scatter3d",
        mode = "markers",
        color = ~as.factor(quality),
        colors = c("#FF0000", "#FF5733", "#FFFF33", "#33FF57", "#33FFF6", "#0099ff", "#9900ff"),
        marker = list(size = 4, opacity = 0.8)) |>
  layout(title = "3D Scatterplot: Quality of Wine by Volatile Acidity, Density, and Alcohol",
         scene = list(xaxis = list(title = "Volatile Acidity"),
                      yaxis = list(title = "Density"),
                      zaxis = list(title = "Alcohol")),
         legend = list(title = list(text = "Legend")))

Create a correlation matrix to select variables for the Linear Regression Model

# Encode 'location' and 'type' as numeric values. as.factor() orders the levels
# alphabetically, so the codes follow that order.
wines$type <- as.numeric(as.factor(wines$type))
wines$location <- as.numeric(as.factor(wines$location))

# Keep the numeric columns. vapply() guarantees a logical vector regardless of
# input, unlike sapply().
numeric_columns <- wines[, vapply(wines, is.numeric, logical(1))]

# `quality` is a factor at this point, so the is.numeric() filter drops it and
# the matrix could not show how any predictor relates to the response. Add it
# back as its numeric score (through as.character() so the labels are used, not
# the level codes). `wines$quality` itself is left as a factor.
# NOTE(review): ID is numeric and therefore still part of the matrix; consider
# dropping it if it is only a row identifier.
numeric_columns$quality <- as.numeric(as.character(wines$quality))

# Compute the correlation matrix
cor_matrix <- cor(numeric_columns, use = "complete.obs")

# Visualize the upper triangle of the correlation matrix with coefficients
library(corrplot)
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45,
         title = "Correlation Matrix Including Location and Type", mar = c(0, 0, 1, 0),
         addCoef.col = "black",
         number.cex = 0.7,
         col = colorRampPalette(c("#B2182B", "#D6604D", "#F4A582", "#FDDBC7", "#FFFFFF", "#D1E5F0", "#92C5DE", "#4393C3", "#2166AC"))(200))